package udacity.storm;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.testing.TestWordSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;

import twitter4j.conf.ConfigurationBuilder;
import twitter4j.TwitterStream;
import twitter4j.TwitterStreamFactory;
import twitter4j.Status;
import twitter4j.StatusDeletionNotice;
import twitter4j.StatusListener;
import twitter4j.StallWarning;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;

import com.lambdaworks.redis.RedisClient;
import com.lambdaworks.redis.RedisConnection;

/**
 * This is a basic example of a Storm topology.
 */
public class TweetTopology
{
  /**
   * A spout that uses the Twitter streaming API to continuously
   * fetch tweets.
   */
  public static class TweetSpout extends BaseRichSpout
  {
    // Twitter API authentication credentials
    String custkey, custsecret;
    String accesstoken, accesssecret;

    // To output tuples from the spout to the next stage bolt
    SpoutOutputCollector collector;

    // Twitter4j - twitter stream to get tweets
    TwitterStream twitterStream;

    // Shared queue for buffering the tweets received from the listener thread
    LinkedBlockingQueue<String> queue = null;

    // Class for listening on the tweet stream - for twitter4j
    private class TweetListener implements StatusListener {

      // Implement the callback function invoked when a tweet arrives
      @Override
      public void onStatus(Status status)
      {
        // add the tweet into the queue buffer
        queue.offer(status.getText());
      }

      @Override
      public void onDeletionNotice(StatusDeletionNotice sdn)
      {
      }

      @Override
      public void onTrackLimitationNotice(int i)
      {
      }

      @Override
      public void onScrubGeo(long l, long l1)
      {
      }

      @Override
      public void onStallWarning(StallWarning warning)
      {
      }

      @Override
      public void onException(Exception e)
      {
        e.printStackTrace();
      }
    }

    /**
     * Constructor for the tweet spout that accepts the credentials.
     */
    public TweetSpout(
        String key,
        String secret,
        String token,
        String tokensecret)
    {
      custkey = key;
      custsecret = secret;
      accesstoken = token;
      accesssecret = tokensecret;
    }

    @Override
    public void open(
        Map                  map,
        TopologyContext      topologyContext,
        SpoutOutputCollector spoutOutputCollector)
    {
      // create the bounded buffer for holding incoming tweets
      queue = new LinkedBlockingQueue<String>(1000);

      // save the output collector for emitting tuples
      collector = spoutOutputCollector;

      // build the config with credentials for twitter4j
      ConfigurationBuilder config =
          new ConfigurationBuilder()
              .setOAuthConsumerKey(custkey)
              .setOAuthConsumerSecret(custsecret)
              .setOAuthAccessToken(accesstoken)
              .setOAuthAccessTokenSecret(accesssecret);

      // create the twitter stream factory with the config
      TwitterStreamFactory fact = new TwitterStreamFactory(config.build());

      // get an instance of the twitter stream
      twitterStream = fact.getInstance();

      // provide the handler for the twitter stream
      twitterStream.addListener(new TweetListener());

      // start the sampling of tweets
      twitterStream.sample();
    }
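    /*
     * Note: Storm invokes nextTuple(), ack() and fail() on the same thread,
     * so nextTuple() must never block. That is why tweets are buffered into
     * the queue on twitter4j's listener thread and only poll()-ed
     * (non-blocking) below, with a short sleep when the queue is empty to
     * avoid busy-waiting.
     */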
    @Override
    public void nextTuple()
    {
      // try to pick a tweet from the buffer
      String ret = queue.poll();

      // if no tweet is available, wait for 50 ms and return
      if (ret == null)
      {
        Utils.sleep(50);
        return;
      }

      // now emit the tweet to the next stage bolt
      collector.emit(new Values(ret));
    }

    @Override
    public void close()
    {
      // shutdown the stream - when we are going to exit
      twitterStream.shutdown();
    }

    /**
     * Component-specific configuration.
     */
    @Override
    public Map<String, Object> getComponentConfiguration()
    {
      // create the component config
      Config ret = new Config();

      // set the parallelism for this spout to be 1
      ret.setMaxTaskParallelism(1);

      return ret;
    }

    @Override
    public void declareOutputFields(
        OutputFieldsDeclarer outputFieldsDeclarer)
    {
      // tell storm the schema of the output tuple for this spout
      // tuple consists of a single column called 'tweet'
      outputFieldsDeclarer.declare(new Fields("tweet"));
    }
  }

  /**
   * A bolt that parses the tweet into words.
   */
  public static class ParseTweetBolt extends BaseRichBolt
  {
    // To output tuples from this bolt to the count bolt
    OutputCollector collector;

    @Override
    public void prepare(
        Map             map,
        TopologyContext topologyContext,
        OutputCollector outputCollector)
    {
      // save the output collector for emitting tuples
      collector = outputCollector;
    }

    @Override
    public void execute(Tuple tuple)
    {
      // get the 1st column 'tweet' from the tuple
      String tweet = tuple.getString(0);

      // provide the delimiters for splitting the tweet
      String delims = "[ .,?!]+";

      // now split the tweet into tokens
      String[] tokens = tweet.split(delims);

      // for each token/word, emit it
      for (String token : tokens) {
        collector.emit(new Values(token));
      }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer)
    {
      // tell storm the schema of the output tuple for this bolt
      // tuple consists of a single column called 'tweet-word'
      declarer.declare(new Fields("tweet-word"));
    }
  }

  /**
   * A bolt that counts the words that it receives.
   */
  static class CountBolt extends BaseRichBolt
  {
    // To output tuples from this bolt to the next stage bolts, if any
    private OutputCollector collector;

    // Map to store the count of the words
    private Map<String, Integer> countMap;

    @Override
    public void prepare(
        Map             map,
        TopologyContext topologyContext,
        OutputCollector outputCollector)
    {
      // save the collector for emitting tuples
      collector = outputCollector;

      // create and initialize the map
      countMap = new HashMap<String, Integer>();
    }

    @Override
    public void execute(Tuple tuple)
    {
      // get the word from the 1st column of the incoming tuple
      String word = tuple.getString(0);

      // check if the word is present in the map
      if (countMap.get(word) == null) {

        // not present, add the word with a count of 1
        countMap.put(word, 1);
      } else {

        // already there, hence get the count
        Integer val = countMap.get(word);

        // increment the count and save it back to the map
        countMap.put(word, ++val);
      }

      // emit the word and its updated count
      collector.emit(new Values(word, countMap.get(word)));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer)
    {
      // tell storm the schema of the output tuple for this bolt
      // tuple consists of two columns: 'word' and 'count'
      outputFieldsDeclarer.declare(new Fields("word", "count"));
    }
  }
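  /*
   * Note: CountBolt keeps its counts in a per-task, in-memory map. This is
   * only correct because main() wires it up with fieldsGrouping on
   * "tweet-word", which guarantees that all occurrences of a given word are
   * routed to the same CountBolt task.
   */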
  /**
   * A bolt that publishes the word and count to redis.
   */
  static class ReportBolt extends BaseRichBolt
  {
    // place holder to keep the connection to redis
    transient RedisConnection<String, String> redis;

    @Override
    public void prepare(
        Map             map,
        TopologyContext topologyContext,
        OutputCollector outputCollector)
    {
      // instantiate a redis client pointed at the local server
      RedisClient client = new RedisClient("localhost", 6379);

      // initiate the actual connection
      redis = client.connect();
    }

    @Override
    public void execute(Tuple tuple)
    {
      // access the first column 'word'
      String word = tuple.getStringByField("word");

      // access the second column 'count'
      Integer count = tuple.getIntegerByField("count");

      // publish the word count to redis using word as the key
      redis.publish("WordCountTopology", word + "|" + Long.toString(count));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer)
    {
      // nothing to declare - since it is the final bolt
    }
  }

  public static void main(String[] args) throws Exception
  {
    // create the topology
    TopologyBuilder builder = new TopologyBuilder();

    /*
     * In order to create the spout, you need twitter credentials.
     * If you need to use the Twitter firehose/tweet stream for your idea,
     * create a set of credentials by following the instructions at
     *
     * https://dev.twitter.com/discussions/631
     */

    // now create the tweet spout with the credentials
    TweetSpout tweetSpout = new TweetSpout(
        "[Your customer key]",
        "[Your secret key]",
        "[Your access token]",
        "[Your access secret]"
    );

    // attach the tweet spout to the topology - parallelism of 1
    builder.setSpout("tweet-spout", tweetSpout, 1);

    //*********************************************************************
    // Complete the topology.

    // Part 1: attach the parse tweet bolt, parallelism of 10
    // (a shuffle grouping spreads tweets evenly across the parser tasks)
    builder.setBolt("parse-tweet-bolt", new ParseTweetBolt(), 10)
           .shuffleGrouping("tweet-spout");

    // Part 2: attach the count bolt, parallelism of 15
    // (a fields grouping sends each word to exactly one counter task)
    builder.setBolt("count-bolt", new CountBolt(), 15)
           .fieldsGrouping("parse-tweet-bolt", new Fields("tweet-word"));

    // Part 3: attach the report bolt, parallelism of 1
    // (a global grouping funnels all counts into the single reporter task)
    builder.setBolt("report-bolt", new ReportBolt(), 1)
           .globalGrouping("count-bolt");

    // Submit and run the topology.
    //*********************************************************************

    // create the default config object
    Config conf = new Config();

    // set the config in debugging mode
    conf.setDebug(true);

    if (args != null && args.length > 0) {

      // run it on a live cluster

      // set the number of workers for running all spout and bolt tasks
      conf.setNumWorkers(3);

      // create the topology and submit it with the config
      StormSubmitter.submitTopology(args[0], conf, builder.createTopology());

    } else {

      // run it in a simulated local cluster

      // set the number of threads to run - similar to setting the number
      // of workers in a live cluster
      conf.setMaxTaskParallelism(3);

      // create the local cluster instance
      LocalCluster cluster = new LocalCluster();

      // submit the topology to the local cluster
      cluster.submitTopology("tweet-word-count", conf, builder.createTopology());

      // let the topology run for 30 seconds. note topologies never terminate!
      Utils.sleep(30000);

      // now kill the topology
      cluster.killTopology("tweet-word-count");

      // we are done, so shutdown the local cluster
      cluster.shutdown();
    }
  }
}
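/*
 * A minimal companion sketch, not part of the original exercise: a
 * standalone subscriber that prints the "word|count" messages ReportBolt
 * publishes. The channel name "WordCountTopology" matches the publish()
 * call above; the pub/sub classes are assumed to come from the same
 * lambdaworks (lettuce 2.x) client already imported by this file, with a
 * Redis server on localhost:6379. Equivalently, from a shell:
 *
 *   redis-cli SUBSCRIBE WordCountTopology
 */
class WordCountSubscriber
{
  public static void main(String[] args) throws InterruptedException
  {
    // open a dedicated pub/sub connection - a regular connection cannot
    // issue other commands while it is subscribed
    RedisClient client = new RedisClient("localhost", 6379);
    com.lambdaworks.redis.pubsub.RedisPubSubConnection<String, String> pubsub =
        client.connectPubSub();

    // print every message published on the topology's channel
    pubsub.addListener(
        new com.lambdaworks.redis.pubsub.RedisPubSubAdapter<String, String>() {
          @Override
          public void message(String channel, String message)
          {
            // messages arrive as "word|count", as published by ReportBolt
            System.out.println(channel + ": " + message);
          }
        });

    pubsub.subscribe("WordCountTopology");

    // keep the process alive so messages keep arriving
    Thread.sleep(Long.MAX_VALUE);
  }
}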